
import pandas as pd
import numpy as np

## Calculate lead inventor rank
## Zihao Li. 06/2024
dir = r'/Volumes/Zihao_SSD2/PatentsView/'

def main():
    print('Loading dataset: g_inventor_gender_race_age.csv...')
    df = pd.read_csv(dir + 'cleandata/g_inventor_gender_race_age.csv', encoding='utf-8', low_memory=False, usecols=['patent_id', 'inventor_name', 'inventor_sequence', 'patent_year', 'inventor_id', 'race80', 'gender_09_100'])

    # Drop non-numeric patents
    print('Dropping non-numeric patents...')
    df["patent_id"] = df["patent_id"].astype(str)
    df = df[~df["patent_id"].str.contains("[a-zA-Z]")].copy()
    df["patent_id"] = pd.to_numeric(df["patent_id"], errors="raise")
    print(df.shape) # (18981857, 7)

    df = df.sort_values(by=['patent_id', 'patent_year']).reset_index(drop=True)
    df['inventor_sequence'] = df['inventor_sequence'].astype('int64')
    lead_inventor_lst = list(df[df['inventor_sequence'] == 0]['inventor_id'].unique())
    df = df[df['inventor_id'].isin(lead_inventor_lst)] 
    print(df.shape) # (15351188, 7)

    # Calculate lead inventor cumsum of patents over years
    df_lead = df.groupby(['inventor_id', 'patent_year'])[['patent_id']].nunique().reset_index()
    df_lead = df_lead.sort_values(by=['inventor_id', 'patent_year'], ascending=[True, True]).reset_index(drop=True)
    df_lead = df_lead.rename(columns={'patent_id':'patent_count'})
    df_lead['patent_cumsum'] = df_lead.groupby('inventor_id')['patent_count'].transform('cumsum')
    print(df_lead.shape) (8366289, 4)

    # Assign percentiles for each year
    lst_percentiles = [0, 50, 80, 90, 95, 99, 100]
    for p in lst_percentiles:
        df_lead[f'perc_{str(p).replace(".0","")}'] = df_lead.groupby('patent_year')['patent_cumsum'].transform(lambda x: int(np.quantile(list(x),p/100)))

    # Assign bin for each year each inventor
    for i in range(len(lst_percentiles) - 1):
        lower_bound = f'perc_{str(lst_percentiles[i]).replace(".0","")}'
        upper_bound = f'perc_{str(lst_percentiles[i+1]).replace(".0","")}'
        df_lead.loc[
            (df_lead['patent_cumsum'] >= df_lead[lower_bound]) &
            (df_lead['patent_cumsum'] < df_lead[upper_bound]),
            'inventor_bin'
        ] = i + 1
    df_lead.loc[df_lead['patent_cumsum'] == df_lead[f'perc_{str(lst_percentiles[-1]).replace(".0","")}'], 'inventor_bin'] = len(lst_percentiles) - 1

    # Merge back to df
    df_cumsum = pd.merge(
        left = df,
        right = df_lead[['inventor_id', 'patent_year', 'patent_count', 'patent_cumsum','inventor_bin']],
        on = ['inventor_id','patent_year'],
        how = 'left'
    )
    df_cumsum = df_cumsum[(df_cumsum['patent_year']>=1981) & (df_cumsum['patent_year']<=2015)]
    df_cumsum = df_cumsum[df_cumsum['inventor_sequence']==0] # Keep the lead inventor for each patent
    df_cumsum.rename(columns={'patent_count': 'leadinventor_patent_count', 'patent_cumsum': 'leadinventor_patent_cumsum'}, inplace=True)

    df_cumsum = df_cumsum[['patent_id', 'leadinventor_patent_count', 'leadinventor_patent_cumsum', 'inventor_bin']]
    
    # Export dataset
    print('Exporting dataset...')
    df_cumsum.to_stata(dir + 'temp/leadinventor_rank.dta', write_index=False)


if __name__ == "__main__":
    main()